library(dplyr)
library(ggplot2)
library(readr)
# Load the spectral dataset: per the read_csv column spec it has one
# character 'name' column and 647 numeric psd_* columns (568 rows).
dataset <- read_csv("new_dataset.csv")
Rows: 568 Columns: 648── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (647): psd_1, psd_2, psd_3, psd_4, psd_5, psd_6, psd_7, psd_8, psd_9, psd_10, psd_11, psd_12, psd_13, psd_14, psd_15, psd_16, psd_17, psd_18, psd_19, psd_20, psd_21, psd_22, psd_23,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(caret)
Loading required package: lattice
# Drop near-zero-variance predictors; they carry almost no information and
# can destabilise the regularised fit.
nzv <- nearZeroVar(dataset, saveMetrics = FALSE)
# Guard: when nothing qualifies, nzv is integer(0) and dataset[, -integer(0)]
# selects ZERO columns, silently wiping out the whole dataset.
if (length(nzv) > 0) {
  dataset <- dataset[, -nzv]
}
We will split the dataset into a training set (70%) and a testing set (30%).
# Reproducible 70/30 train/test split by row index.
set.seed(123)
n_rows <- nrow(dataset)
sample_index <- sample(seq_len(n_rows), 0.7 * n_rows)
train_data <- dataset[sample_index, ]
test_data <- dataset[-sample_index, ]
Using the glmnet package, we will predict the tmg feature.
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
# Response vector and numeric predictor matrix for glmnet
# (drop the identifier column and the target itself).
y <- train_data$tmg
x <- as.matrix(select(train_data, -name, -tmg))
Standardization (Z-score normalization): This method transforms each feature to have a mean of 0 and a standard deviation of 1. It’s particularly useful when your features have different units or very different scales.
library(caret)
# Fit a z-score scaler (center + scale) on the training predictors, then
# apply it; the same scaler object must be reused on any evaluation data.
scaled_x <- preProcess(x, method = c("center", "scale"))
x <- predict(scaled_x, x)
In the context of Elastic Net regression, the alpha
parameter is a crucial component that balances the mix between Lasso
(L1) and Ridge (L2) regularization methods. Elastic Net is a
regularization technique that combines both L1 and L2 penalties, which
are used to prevent overfitting by adding a penalty to the model’s loss
function.
Here’s a breakdown of the alpha parameter:
Range: alpha can take on any value
between 0 and 1 (inclusive).
alpha = 1: The penalty is entirely Lasso (L1 regularization).
alpha = 0: The penalty is entirely Ridge (L2 regularization).
alpha between 0 and 1: A combination of Lasso and Ridge.
Effect of L1 (Lasso) Regularization: L1 regularization adds a penalty equal to the absolute value of the magnitude of coefficients. This can lead to some coefficients being exactly zero, which is useful for feature selection if you have a large number of features.
Effect of L2 (Ridge) Regularization: L2 regularization adds a penalty equal to the square of the magnitude of coefficients. This tends to shrink the coefficients but does not set them to zero, which is useful when you have correlated features.
Choosing alpha: The most common way to choose an alpha value is through cross-validation, trying different values and selecting the one that minimizes prediction error.
Interaction with lambda: The
lambda parameter in Elastic Net controls the overall
strength of the penalty. So, the effect of alpha is in
conjunction with lambda. A grid search over both
alpha and lambda is a common practice to find
the best combination that minimizes cross-validation error.
In summary, the alpha parameter in Elastic Net allows
you to balance the type of regularization applied to your model,
providing the flexibility to choose between Lasso, Ridge, or a mix of
both based on your data and the specific requirements of your
problem.
# Lasso model: alpha = 1 is pure L1 (lasso), NOT elastic net.
# (The original inline comment claimed "alpha=0.5 indicates Elastic Net",
# which contradicted the code; alpha = 0.5 would be an equal L1/L2 mix.)
set.seed(123)
cv_model <- cv.glmnet(x, y, alpha = 1)
# lambda.min = penalty strength with the lowest cross-validated error.
best_lambda <- cv_model$lambda.min
model_en <- glmnet(x, y, alpha = 1, lambda = best_lambda)
print(model_en)
Call: glmnet(x = x, y = y, alpha = 1, lambda = best_lambda)
Df %Dev Lambda
1 50 86.09 0.000727
# One shared fold assignment so the three cv.glmnet runs below are
# directly comparable (same folds, different alpha).
foldid <- sample(1:10, size = length(y), replace = TRUE)
cv1 <- cv.glmnet(x, y, foldid = foldid, alpha = 1)    # lasso
cv.5 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.5) # elastic net
cv0 <- cv.glmnet(x, y, foldid = foldid, alpha = 0)    # ridge
par(mfrow = c(2,2))
Warning: unable to load shared object '/usr/local/lib/R/modules//R_X11.so':
libXt.so.6: cannot open shared object file: No such file or directory
# Panel 1-3: per-alpha CV curves; panel 4: all three CV-error curves
# overlaid against log(lambda) for direct comparison.
plot(cv1)
title("alpha=1")
plot(cv.5)
title("alpha=0.5")
plot(cv0)
title("alpha=0")
plot(log(cv1$lambda), cv1$cvm, pch = 19, col = "red",
     xlab = "Log(λ)", ylab = cv1$name)
points(log(cv.5$lambda), cv.5$cvm, pch = 19, col = "grey")
points(log(cv0$lambda), cv0$cvm, pch = 19, col = "blue")
legend("topleft", legend = c("alpha= 1", "alpha= .5", "alpha 0"),
       pch = 19, col = c("red", "grey", "blue"))
# Finer alpha sweep (5 values), again with one shared fold assignment.
foldid <- sample(1:10, size = length(y), replace = TRUE)
cv1.00 <- cv.glmnet(x, y, foldid = foldid, alpha = 1.00)
cv0.75 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.75)
cv0.50 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.50)
cv0.25 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.25)
cv0.00 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.00)
par(mfrow = c(2,3))
plot(cv1.00); title("alpha=1.00")
plot(cv0.75); title("alpha=0.75")
plot(cv0.50); title("alpha=0.50")
plot(cv0.25); title("alpha=0.25")
plot(cv0.00); title("alpha=0.00")
# Overlay all five CV-error curves. NOTE(review): ylab uses cv1$name from the
# earlier 3-alpha block — presumably the same error-measure label as
# cv1.00$name, but confirm cv1 is still in scope when rerunning.
plot(log(cv1.00$lambda), cv1.00$cvm , pch = 19, col = "red", xlab = "Log(λ)", ylab = cv1$name)
points(log(cv0.75$lambda), cv0.75$cvm, pch = 19, col = "green")
points(log(cv0.50$lambda), cv0.50$cvm, pch = 19, col = "grey")
points(log(cv0.25$lambda), cv0.25$cvm, pch = 19, col = "orange")
points(log(cv0.00$lambda), cv0.00$cvm, pch = 19, col = "blue")
alphas = c("alpha=1.00", "alpha=0.75", "alpha=0.50", "alpha=0.25", "alpha=0.00")
legend("topleft", legend = alphas, pch = 19, col = c("red","green","grey","orange","blue"))
# Tabulate, per alpha, the minimum CV error and the log-lambda at which it
# occurs.
cvm=c(min(cv1.00$cvm), min(cv0.75$cvm), min(cv0.50$cvm), min(cv0.25$cvm), min(cv0.00$cvm))
loglambda=c(log(cv1.00$lambda[which.min(cv1.00$cvm)]),log(cv0.75$lambda[which.min(cv0.75$cvm)]), log(cv0.50$lambda[which.min(cv0.50$cvm)]), log(cv0.25$lambda[which.min(cv0.25$cvm)]), log(cv0.00$lambda[which.min(cv0.00$cvm)]))
data.frame(alphas=alphas, CVM=cvm, minCVMLogLambda=loglambda)
# install.packages("parallel")
library(parallel)
# Fresh fold assignment, shared across the entire alpha grid search below.
foldid <- sample(1:10, size = length(y), replace = TRUE)
# Cross-validate an elastic net at the given alpha (using the shared foldid)
# and summarise its CV-error curve: the minimum, the log-lambda achieving it,
# and several low percentiles of the curve.
#
# Note: the original built cbind(cvm, log(lambda)) and took column-wise
# quantiles, then used only the cvm column — quantiles of net$cvm alone are
# equivalent and far clearer.
test_alpha <- function(alpha){
  net <- cv.glmnet(x, y, foldid = foldid, alpha = alpha)
  cvm_q <- quantile(net$cvm, probs = c(0.05, 0.10, 0.15, 0.20))
  data.frame(
    alpha = alpha,
    minCVM = min(net$cvm),
    bestLambda = log(net$lambda[which.min(net$cvm)]),
    per05CVM = cvm_q[[1]],
    per10CVM = cvm_q[[2]],
    per15CVM = cvm_q[[3]],
    per20CVM = cvm_q[[4]]
  )
}
# Dense grid of 1000 alpha values in [0, 1], evaluated in parallel.
# NOTE(review): mc.cores=32 assumes a 32-core machine; mclapply also falls
# back to serial on Windows — confirm the target environment.
test_alphas <- seq(from = 0, to = 1, length.out = 1000)
test_results <- bind_rows(mclapply(test_alphas, test_alpha, mc.cores=32))
# test_results <- bind_rows(lapply(test_alphas, test_alpha))
print(test_results)
test_results
# par(mfrow = c(2, 1))
# Best lambda and minimum CV error as functions of alpha.
plot(x=test_results$alpha, y=test_results$bestLambda, ylab="bestLambda")
plot(x=test_results$alpha, y=test_results$minCVM, ylab="00 percentile")
# plot(x=test_results$alpha, y=test_results$per05CVM, ylab="05 percentile")
# plot(x=test_results$alpha, y=test_results$per10CVM, ylab="10 percentile")
# plot(x=test_results$alpha, y=test_results$per15CVM, ylab="15 percentile")
# plot(x=test_results$alpha, y=test_results$per20CVM, ylab="20 percentile")
# The variable importance is inferred from the coefficients
calculate_top_importance <- function(model, top_n){
c <- coef(model)
predictor <- c %>% rownames()
importance <- c %>% as.matrix()
importance <- data.frame(predictor,importance)
rownames(importance) <- NULL
names(importance)[2] <- "importance"
importance<-importance[-1,] # Exclude intercept
importance <- importance[order(-importance$importance), ]
importance <- head(importance, top_n)
return(importance)
}
# Refit ridge (alpha = 0) and lasso (alpha = 1) with the shared folds, then
# extract each model's 20 highest-ranked predictors.
alpha0 <- cv.glmnet(x, y, foldid = foldid, alpha = 0.00)
alpha1 <- cv.glmnet(x, y, foldid = foldid, alpha = 1.00)
top0 <- calculate_top_importance(alpha0, 20)
top1 <- calculate_top_importance(alpha1, 20)
library(dplyr)
library(ggplot2)
# Horizontal bar chart of predictor importance, highest-ranked at the top.
plot_importance <- function(importance){
  ordered_levels <- rev(unique(importance$predictor))
  plot_data <- mutate(
    importance,
    predictor = factor(predictor, levels = ordered_levels)
  )
  ggplot(plot_data) +
    geom_col(aes(y = predictor, x = importance),
             fill = 'darkblue', color = 'gray') +
    ggtitle("Top 20 predictor importance using glmnet") +
    theme_minimal()
}
plot_importance(top0)
plot_importance(top1)
# RMSE of the fitted model on `data`, reusing the training-set scaler
# (scaled_x), model (model_en) and penalty (best_lambda) from above.
get_rmse <- function(data){
  y_true <- data$tmg
  x_new <- as.matrix(select(data, -name, -tmg))
  x_new <- predict(scaled_x, x_new)
  preds <- predict(model_en, s = best_lambda, newx = x_new)
  sqrt(mean((preds - y_true)^2))
}
print(get_rmse(test_data))
[1] 0.03531884
print(get_rmse(dataset))
[1] 0.03187589
library(readr)
library(caret)
Loading required package: ggplot2
Loading required package: lattice
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
library(caret)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Reload the dataset (fresh session rerun of the pipeline above).
dataset <- read_csv("new_dataset.csv")
Rows: 568 Columns: 648── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (647): psd_1, psd_2, psd_3, psd_4, psd_5, psd_6, psd_7, psd_8, psd_9, psd_10, psd_11, psd_12, psd_13, psd_14, psd_15, psd_16, psd_17, psd_18, psd_19, psd_20, psd_21, psd_22, psd_23,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop near-zero-variance predictors (repeat of the earlier filtering step).
nzv <- nearZeroVar(dataset, saveMetrics = FALSE)
# Guard: nzv = integer(0) would make dataset[, -nzv] select ZERO columns.
if (length(nzv) > 0) {
  dataset <- dataset[, -nzv]
}
# Reproducible 70/30 split, then build the training response/predictors and
# fit the z-score scaler on the training predictors only.
set.seed(123)
n_obs <- nrow(dataset)
sample_index <- sample(seq_len(n_obs), 0.7 * n_obs)
train_data <- dataset[sample_index, ]
test_data <- dataset[-sample_index, ]
y_train <- train_data$tmg
x_train <- as.matrix(select(train_data, -name, -tmg))
scaled_x <- preProcess(x_train, method = c("center", "scale"))
x_train <- predict(scaled_x, x_train)
set.seed(123)
# alpha = 1 -> pure lasso (L1) penalty.
alpha = 1
# Random 10-fold assignment for cross-validation.
foldid <- sample(1:10, size = length(y_train), replace = TRUE)
foldid
[1] 3 3 10 2 6 5 4 6 9 10 5 3 9 9 9 3 8 10 7 10 9 3 4 1 7 5 10 7 9 9 10 7 5 7 5 6 9 2 5 8 2 1 9 9 6 5 9 10 4 6 8 6 6 7 1 6 2 1 2 4 5
[62] 6 3 9 4 6 9 9 7 3 8 9 3 7 3 7 6 10 5 5 8 3 10 2 10 2 10 6 4 1 6 3 8 3 8 1 7 7 7 10 6 7 10 5 6 8 5 7 4 3 9 7 6 10 9 7 2 3 8 4 7 4
[123] 1 8 4 9 8 6 4 8 3 4 4 6 1 10 4 9 7 8 5 2 6 9 8 10 4 5 7 1 8 8 10 9 8 2 5 9 7 7 10 10 8 6 7 10 1 9 3 10 5 6 9 4 10 7 9 7 10 4 8 9 9
[184] 9 5 7 6 1 10 10 1 10 1 10 5 7 5 10 9 4 6 2 1 5 9 4 3 9 1 2 4 10 1 5 5 9 8 7 9 5 2 10 6 7 9 1 5 5 8 5 7 4 2 1 1 2 1 2 8 1 3 2 2 5
[245] 9 10 6 10 10 6 3 4 3 7 3 2 7 3 9 7 9 4 2 6 10 9 7 5 1 3 2 5 1 3 9 2 6 3 3 9 2 1 6 10 8 10 10 4 6 4 4 9 8 7 10 1 2 8 3 10 7 1 2 3 3
[306] 6 10 5 4 2 1 1 5 2 2 3 5 6 10 3 3 1 8 1 4 8 10 10 8 7 2 1 4 9 5 8 2 5 2 6 10 10 9 7 6 3 8 7 4 8 3 8 6 4 7 10 8 4 10 9 6 3 4 9 9 4
[367] 4 6 5 1 8 7 5 6 3 8 10 5 7 4 9 3 10 7 2 9 6 4 6 2 2 8 8 5 9 7 3
# Cross-validate to pick lambda, then refit at lambda.min.
cv_model <- cv.glmnet(x_train, y_train, foldid = foldid, alpha = alpha)
best_lambda <- cv_model$lambda.min
# Fix: foldid is a cv.glmnet-only argument — glmnet() has no use for it, so
# passing it there was at best silently ignored; it is dropped here.
model_en <- glmnet(x_train, y_train, alpha = alpha, lambda = best_lambda)
# Evaluate the fitted model on `data`: print "<RMSE><type>" and return a
# predicted-vs-reference scatter plot with the y = x identity line.
do_plot <- function(data, type) {
  y_eval <- data$tmg
  x_eval <- as.matrix(select(data, -name, -tmg))
  x_eval <- predict(scaled_x, x_eval)
  predictions <- predict(model_en, s = best_lambda, newx = x_eval)
  RMSE <- sqrt(mean((predictions - y_eval)^2))
  print(paste0(RMSE, type))
  results <- data.frame(Reference = y_eval, Predicted = as.vector(predictions))
  ggplot(results, aes(x = Reference, y = Predicted)) +
    geom_point(color = 'blue') +
    geom_abline(intercept = 0, slope = 1, color = 'red') +
    ggtitle(paste(type, "glmnet; RMSE:", RMSE)) +
    xlab("Reference Values") +
    ylab("Predicted Values") +
    theme_bw()
}
do_plot(test_data, "Test")
[1] "0.0353347640606086Test"
do_plot(train_data, "Train")
[1] "0.0309269148825548Train"
do_plot(dataset, "Total")
[1] "0.032317240545742Total"
best_lambda
[1] 0.0009610472
# install.packages("gridExtra")
library(gridExtra)
library(parallel)
# Scatter plot of one predictor column (y-axis) against the tmg response
# (x-axis). `importance` is accepted for interface compatibility with
# plot_important() but is not used in the plot itself.
plot_column <- function(column, importance){
  points_df <- data.frame(x = dataset$tmg, y = dataset[[column]])
  ggplot(points_df, aes(x = x, y = y)) +
    geom_point(color = 'blue') +
    xlab("") +
    ylab(column) +
    theme_bw()
}
# Arrange scatter plots for the top n_to_plot predictors in a grid with the
# given title. Plots are built in parallel via mclapply.
plot_important <- function(important, title, n_to_plot = 6, columns = 2) {
  top_predictors <- important$predictor[1:n_to_plot]
  top_importance <- important$importance[1:n_to_plot]
  plots <- mclapply(top_predictors, plot_column, importance = top_importance)
  grid.arrange(top = title, grobs = plots, ncol = columns)
}
# plot_list1 <- lapply(top1$predictor[1:6], plot_column)
# grid.arrange(top="Alpha=1", grobs = plot_list1, ncol = 2)
# plot_list0 <- lapply(top0$predictor[1:6], plot_column)
# grid.arrange(top="Alpha=0", grobs = plot_list0, ncol = 2)
top0
# Render the ridge model's top-20 predictor grid to a PNG file.
png("alpha0.png", width = 1000, height = 1200)
plot_important(top0, n_to_plot=20, columns=4, "Alpha=0")
dev.off()
null device
1
# Render the lasso model's top-20 predictor grid to a PNG file.
png("alpha1.png", width = 1000, height = 1200)
plot_important(top1, n_to_plot=20, columns=4, "Alpha=1")
dev.off()
null device
1
# Reload the raw, unfiltered dataset (no near-zero-variance removal) for
# plotting a raw spectrum.
full_dataset <- read_csv("new_dataset.csv")
Rows: 568 Columns: 648── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (647): psd_1, psd_2, psd_3, psd_4, psd_5, psd_6, psd_7, psd_8, psd_9, psd_10, psd_11, psd_12, psd_13, psd_14, psd_15, psd_16, psd_17, psd_18...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
full_dataset
# Keep only the psd_* columns and plot the first row as a spectrum
# (transpose turns the 1-row slice into a column of values for plot()).
selected_columns <- full_dataset[, grep("(^psd_)", names(full_dataset))]
plot(t(selected_columns[1,]))